In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
In [2]:
def load_embeddings():
    """
    Dataset load in pandas dataframe
    :return: scaled & encoded data
    """
    # loading  dataset
    dataset = pd.read_csv("lab_courses_2015.csv", sep=';')

    # checking data shape
    row, col = dataset.shape
    print(f'There are {row} rows and {col} columns') 
    print(dataset.head(10))
    

    # to work on copy of the data
    dataset_scaled = dataset.copy()

    # Scaling the data to keep the different attributes in same range.
    dataset_scaled[dataset_scaled.columns] = StandardScaler().fit_transform(dataset_scaled)
    print(dataset_scaled.describe())

    return dataset_scaled
In [3]:
def pca_embeddings(df_scaled):
    """Reduce the dimensions of dataset with use of Principal Component Analysis (PCA).
    :param df_scaled: scaled data
    :return: pca result, pca for plotting graph
    """

    pca_2 = PCA(n_components=2)
    pca_2_result = pca_2.fit_transform(df_scaled)
    print('Explained variation per principal component: {}'.format(pca_2.explained_variance_ratio_))
    print('Cumulative variance explained by 2 principal components: {:.2%}'.format(
        np.sum(pca_2.explained_variance_ratio_)))

    # Results from pca.components_
    dataset_pca = pd.DataFrame(abs(pca_2.components_), columns=df_scaled.columns, index=['PC_1', 'PC_2'])
    print('\n\n', dataset_pca)
    
    print("\n*************** Most important features *************************")
    print('As per PC 1:\n', (dataset_pca[dataset_pca > 0.3].iloc[0]).dropna())
    print('\n\nAs per PC 2:\n', (dataset_pca[dataset_pca > 0.3].iloc[1]).dropna())
    print("\n******************************************************************")

    return pca_2_result, pca_2
In [4]:
def visualizing_results(pca_result, label, centroids_pca):
    """ Visualizing the clusters
    :param pca_result: PCA applied data
    :param label: K Means labels
    :param centroids_pca: PCA format K Means centroids
    """
    # ------------------ Using Matplotlib for plotting-----------------------
    x = pca_result[:, 0]
    y = pca_result[:, 1]   
        
    plt.figure(figsize=(14,7), dpi=1500)
    plt.scatter(x, y, c=label, alpha=0.6, s=200, edgecolors="black")  # plot different colors per cluster
    plt.title('Student clusters')
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')

    plt.scatter(centroids_pca[:, 0], centroids_pca[:, 1], marker='X', s=200, linewidths=1.5,
                color='red', edgecolors="black", lw=1.5)

    plt.show()
    plt.savefig('clusters.png')
In [5]:
def main():
    print("1. Loading dataset\n")
    data_scaled = load_embeddings()
    
    print("\n\n2. Reducing via PCA\n")
    pca_result, pca_2 = pca_embeddings(data_scaled)

    # fitting KMeans
    kmeans = KMeans(n_clusters=4)
    kmeans.fit(data_scaled)
    centroids = kmeans.cluster_centers_
    centroids_pca = pca_2.transform(centroids)

    print("\n\n4. Visualizing the data")
    visualizing_results(pca_result, kmeans.labels_, centroids_pca)

if __name__ == "__main__":
    main()
1. Loading dataset

There are 216006 rows and 6 columns
   component  action  target  contextid  contextlevel  contextinstanceid
0          5      10       5     105728            50               1089
1         17      10       6     105844            70              67635
2          5      10       5     105728            50               1089
3          9      10      12     105734            70              67525
4          5      10       5     106176            50               1094
5          5      10       5     106176            50               1094
6          5      10       5     106176            50               1094
7         18      10       6     110766            70              70458
8          5      10       5     105728            50               1089
9         17      10       6     105844            70              67635
          component        action        target     contextid  contextlevel  \
count  2.160060e+05  2.160060e+05  2.160060e+05  2.160060e+05  2.160060e+05   
mean   4.825240e-13 -8.526593e-14 -4.700615e-14 -2.279710e-13  4.432454e-13   
std    1.000002e+00  1.000002e+00  1.000002e+00  1.000002e+00  1.000002e+00   
min   -1.634216e+00 -8.719580e+00 -1.488072e+00 -1.180162e+00 -1.083591e+00   
25%   -8.036008e-01  1.932715e-01 -3.624239e-01 -1.180162e+00 -1.083591e+00   
50%    2.701443e-02  1.932715e-01 -3.624239e-01  2.389723e-01  9.228571e-01   
75%    2.701443e-02  1.932715e-01  2.004002e-01  1.235776e+00  9.228571e-01   
max    2.103553e+00  1.932715e-01  2.733109e+00  1.442257e+00  9.228571e-01   

       contextinstanceid  
count       2.160060e+05  
mean       -2.905845e-13  
std         1.000002e+00  
min        -9.725989e-01  
25%        -9.725989e-01  
50%        -1.993837e-01  
75%         7.680869e-01  
max         1.670854e+00  


2. Reducing via PCA

Explained variation per principal component: [0.54952458 0.20794695]
Cumulative variance explained by 2 principal components: 75.75%


       component    action   target  contextid  contextlevel  contextinstanceid
PC_1   0.472440  0.093180  0.04890   0.470343      0.509519           0.533756
PC_2   0.317474  0.535157  0.75652   0.056370      0.192524           0.015838

*************** Most important features *************************
As per PC 1:
 component            0.472440
contextid            0.470343
contextlevel         0.509519
contextinstanceid    0.533756
Name: PC_1, dtype: float64


As per PC 2:
 component    0.317474
action       0.535157
target       0.756520
Name: PC_2, dtype: float64

******************************************************************
C:\Users\user\anaconda3\lib\site-packages\sklearn\base.py:450: UserWarning: X does not have valid feature names, but PCA was fitted with feature names
  warnings.warn(

4. Visualizing the data
<Figure size 432x288 with 0 Axes>